library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, apply, as.factor, as.numeric, colnames, colnames<-,
## ifelse, is.character, is.factor, is.numeric, log, log10, log1p,
## log2, round, signif, trunc, ||
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(lme4)
## Loading required package: Matrix
library(ggalluvial)
library(xgboost)
library(jsonlite)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:h2o':
##
## day, hour, month, week, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(knitr)
library(Rmisc)
## Loading required package: plyr
library(scales)
library(countrycode)
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
library(glmnet)
## Loaded glmnet 4.0-2
library(keras)
library(forecast)
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(magrittr)
library(tidyverse)
## -- Attaching packages ------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## v purrr 0.3.4
## -- Conflicts ---------------------------------------- tidyverse_conflicts() --
## x dplyr::arrange() masks plyr::arrange()
## x lubridate::as.difftime() masks base::as.difftime()
## x readr::col_factor() masks scales::col_factor()
## x purrr::compact() masks plyr::compact()
## x dplyr::count() masks plyr::count()
## x lubridate::date() masks base::date()
## x lubridate::day() masks h2o::day()
## x purrr::discard() masks scales::discard()
## x tidyr::expand() masks Matrix::expand()
## x tidyr::extract() masks magrittr::extract()
## x dplyr::failwith() masks plyr::failwith()
## x dplyr::filter() masks stats::filter()
## x purrr::flatten() masks jsonlite::flatten()
## x lubridate::hour() masks h2o::hour()
## x dplyr::id() masks plyr::id()
## x lubridate::intersect() masks base::intersect()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
## x lubridate::month() masks h2o::month()
## x dplyr::mutate() masks plyr::mutate()
## x tidyr::pack() masks Matrix::pack()
## x dplyr::rename() masks plyr::rename()
## x purrr::set_names() masks magrittr::set_names()
## x lubridate::setdiff() masks base::setdiff()
## x dplyr::slice() masks xgboost::slice()
## x dplyr::summarise() masks plyr::summarise()
## x dplyr::summarize() masks plyr::summarize()
## x lubridate::union() masks base::union()
## x tidyr::unpack() masks Matrix::unpack()
## x lubridate::week() masks h2o::week()
## x lubridate::year() masks h2o::year()
library(ggplot2)
# Fix the RNG seed so any sampling below is reproducible.
set.seed(0)
# Raw competition data; paths are relative to the working directory —
# NOTE(review): assumes a ./data folder with the Kaggle CSVs, confirm layout.
tr <- read_csv("./data/train.csv")
te <- read_csv("./data/test.csv")
subm <- read_csv("./data/sample_submission.csv")
## Train set file size: 1503430926 bytes
## Train set dimensions: 903653 12
## Rows: 903,653
## Columns: 12
## $ channelGrouping <chr> "Organic Search", "Organic Search", "Organic S...
## $ date <dbl> 20160902, 20160902, 20160902, 20160902, 201609...
## $ device <chr> "{\"browser\": \"Chrome\", \"browserVersion\":...
## $ fullVisitorId <chr> "1131660440785968503", "377306020877927890", "...
## $ geoNetwork <chr> "{\"continent\": \"Asia\", \"subContinent\": \...
## $ sessionId <chr> "1131660440785968503_1472830385", "37730602087...
## $ socialEngagementType <chr> "Not Socially Engaged", "Not Socially Engaged"...
## $ totals <chr> "{\"visits\": \"1\", \"hits\": \"1\", \"pagevi...
## $ trafficSource <chr> "{\"campaign\": \"(not set)\", \"source\": \"g...
## $ visitId <dbl> 1472830385, 1472880147, 1472865386, 1472881213...
## $ visitNumber <dbl> 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ visitStartTime <dbl> 1472830385, 1472880147, 1472865386, 1472881213...
## Test set file size: 1346845550 bytes
## Test set dimensions: 804684 12
## Rows: 804,684
## Columns: 12
## $ channelGrouping <chr> "Organic Search", "Organic Search", "Organic S...
## $ date <dbl> 20171016, 20171016, 20171016, 20171016, 201710...
## $ device <chr> "{\"browser\": \"Chrome\", \"browserVersion\":...
## $ fullVisitorId <chr> "6167871330617112363", "0643697640977915618", ...
## $ geoNetwork <chr> "{\"continent\": \"Asia\", \"subContinent\": \...
## $ sessionId <chr> "6167871330617112363_1508151024", "06436976409...
## $ socialEngagementType <chr> "Not Socially Engaged", "Not Socially Engaged"...
## $ totals <chr> "{\"visits\": \"1\", \"hits\": \"4\", \"pagevi...
## $ trafficSource <chr> "{\"campaign\": \"(not set)\", \"source\": \"g...
## $ visitId <dbl> 1508151024, 1508175522, 1508143220, 1508193530...
## $ visitNumber <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1...
## $ visitStartTime <dbl> 1508151024, 1508175522, 1508143220, 1508193530...
As shown in the figure, only a few transactions in the train set occur after Jul 2017, because the rest are in the test set. It makes sense to create time-based splits for train/validation sets.
head(te)
## # A tibble: 6 x 12
## channelGrouping date device fullVisitorId geoNetwork sessionId
## <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 Organic Search 2.02e7 "{\"b~ 616787133061~ "{\"conti~ 61678713~
## 2 Organic Search 2.02e7 "{\"b~ 064369764097~ "{\"conti~ 06436976~
## 3 Organic Search 2.02e7 "{\"b~ 605938381096~ "{\"conti~ 60593838~
## 4 Organic Search 2.02e7 "{\"b~ 237672007856~ "{\"conti~ 23767200~
## 5 Organic Search 2.02e7 "{\"b~ 231454452079~ "{\"conti~ 23145445~
## 6 Organic Search 2.02e7 "{\"b~ 413303988410~ "{\"conti~ 41330398~
## # ... with 6 more variables: socialEngagementType <chr>, totals <chr>,
## # trafficSource <chr>, visitId <dbl>, visitNumber <dbl>, visitStartTime <dbl>
unique(tr$socialEngagementType)
## [1] "Not Socially Engaged"
Inspecting the values in the columns "device", "geoNetwork", "trafficSource", and "totals", we find that they are stored in JSON format.
# Parse a character vector of per-row JSON objects into a flat data frame.
# The rows are joined into one JSON array string so a single fromJSON()
# call handles them all; flatten = TRUE unnests nested objects into
# top-level columns. (Fixed: use TRUE, not the reassignable alias T.)
flatten_json <- . %>%
  str_c(., collapse = ",") %>%
  str_c("[", ., "]") %>%
  fromJSON(flatten = TRUE)
# Expand the four JSON-encoded columns into flat columns and drop the raw
# JSON text afterwards.
# NOTE(review): this name masks base::parse() for the rest of the script.
parse <- function(df) {
  df %>%
    bind_cols(flatten_json(df$device)) %>%
    bind_cols(flatten_json(df$geoNetwork)) %>%
    bind_cols(flatten_json(df$trafficSource)) %>%
    bind_cols(flatten_json(df$totals)) %>%
    select(-device, -geoNetwork, -trafficSource, -totals)
}
Let's convert train and test sets to the tidy format:
# Flatten the JSON columns of both sets into tidy tabular form.
tr <- parse(tr)
te <- parse(te)
| channelGrouping | date | fullVisitorId | sessionId | socialEngagementType | visitId | visitNumber | visitStartTime | browser | browserVersion | browserSize | operatingSystem | operatingSystemVersion | isMobile | mobileDeviceBranding | mobileDeviceModel | mobileInputSelector | mobileDeviceInfo | mobileDeviceMarketingName | flashVersion | language | screenColors | screenResolution | deviceCategory | continent | subContinent | country | region | metro | city | cityId | networkDomain | latitude | longitude | networkLocation | campaign | source | medium | keyword | isTrueDirect | referralPath | adContent | campaignCode | adwordsClickInfo.criteriaParameters | adwordsClickInfo.page | adwordsClickInfo.slot | adwordsClickInfo.gclId | adwordsClickInfo.adNetworkType | adwordsClickInfo.isVideoAd | visits | hits | pageviews | bounces | newVisits | transactionRevenue |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Organic Search | 20160902 | 1131660440785968503 | 1131660440785968503_1472830385 | Not Socially Engaged | 1472830385 | 1 | 1472830385 | Chrome | not available in demo dataset | not available in demo dataset | Windows | not available in demo dataset | FALSE | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | desktop | Asia | Western Asia | Turkey | Izmir | (not set) | Izmir | not available in demo dataset | ttnet.com.tr | not available in demo dataset | not available in demo dataset | not available in demo dataset | (not set) | organic | (not provided) | NA | NA | NA | NA | not available in demo dataset | NA | NA | NA | NA | NA | 1 | 1 | 1 | 1 | 1 | NA | |
| Organic Search | 20160902 | 377306020877927890 | 377306020877927890_1472880147 | Not Socially Engaged | 1472880147 | 1 | 1472880147 | Firefox | not available in demo dataset | not available in demo dataset | Macintosh | not available in demo dataset | FALSE | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | desktop | Oceania | Australasia | Australia | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | dodo.net.au | not available in demo dataset | not available in demo dataset | not available in demo dataset | (not set) | organic | (not provided) | NA | NA | NA | NA | not available in demo dataset | NA | NA | NA | NA | NA | 1 | 1 | 1 | 1 | 1 | NA |
| channelGrouping | date | fullVisitorId | sessionId | socialEngagementType | visitId | visitNumber | visitStartTime | browser | browserVersion | browserSize | operatingSystem | operatingSystemVersion | isMobile | mobileDeviceBranding | mobileDeviceModel | mobileInputSelector | mobileDeviceInfo | mobileDeviceMarketingName | flashVersion | language | screenColors | screenResolution | deviceCategory | continent | subContinent | country | region | metro | city | cityId | networkDomain | latitude | longitude | networkLocation | campaign | source | medium | keyword | isTrueDirect | referralPath | adContent | adwordsClickInfo.criteriaParameters | adwordsClickInfo.page | adwordsClickInfo.slot | adwordsClickInfo.gclId | adwordsClickInfo.adNetworkType | adwordsClickInfo.isVideoAd | visits | hits | pageviews | newVisits | bounces |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Organic Search | 20171016 | 6167871330617112363 | 6167871330617112363_1508151024 | Not Socially Engaged | 1508151024 | 2 | 1508151024 | Chrome | not available in demo dataset | not available in demo dataset | Macintosh | not available in demo dataset | FALSE | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | desktop | Asia | Southeast Asia | Singapore | (not set) | (not set) | (not set) | not available in demo dataset | myrepublic.com.sg | not available in demo dataset | not available in demo dataset | not available in demo dataset | (not set) | organic | (not provided) | TRUE | NA | NA | not available in demo dataset | NA | NA | NA | NA | NA | 1 | 4 | 4 | NA | NA | |
| Organic Search | 20171016 | 0643697640977915618 | 0643697640977915618_1508175522 | Not Socially Engaged | 1508175522 | 1 | 1508175522 | Chrome | not available in demo dataset | not available in demo dataset | Windows | not available in demo dataset | FALSE | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | not available in demo dataset | desktop | Europe | Southern Europe | Spain | Aragon | (not set) | Zaragoza | not available in demo dataset | rima-tde.net | not available in demo dataset | not available in demo dataset | not available in demo dataset | (not set) | organic | (not provided) | NA | NA | NA | not available in demo dataset | NA | NA | NA | NA | NA | 1 | 5 | 5 | 1 | NA |
| fullVisitorId | PredictedLogRevenue |
|---|---|
| 0000000259678714014 | 0 |
| 0000049363351866189 | 0 |
| 0000053049821714864 | 0 |
| 0000059488412965267 | 0 |
| 0000085840370633780 | 0 |
# Columns present in train but absent from test.
setdiff(names(tr), names(te))
## [1] "campaignCode" "transactionRevenue"
# `one_of()` is superseded; `any_of()` drops the column and stays silent
# if it is already gone, making this chunk safe to re-run.
tr %<>% select(-any_of("campaignCode"))
The test set lacks two columns. One column is a target variable transactionRevenue. The second column (campaignCode) we remove from the train set.
Let's find constant columns:
# Count distinct values per column; single-valued columns are constants
# with no predictive signal. map_int() is type-stable, unlike sapply().
fea_uniq_values <- map_int(tr, n_distinct)
(fea_del <- names(fea_uniq_values[fea_uniq_values == 1]))
## [1] "socialEngagementType" "browserVersion"
## [3] "browserSize" "operatingSystemVersion"
## [5] "mobileDeviceBranding" "mobileDeviceModel"
## [7] "mobileInputSelector" "mobileDeviceInfo"
## [9] "mobileDeviceMarketingName" "flashVersion"
## [11] "language" "screenColors"
## [13] "screenResolution" "cityId"
## [15] "latitude" "longitude"
## [17] "networkLocation" "adwordsClickInfo.criteriaParameters"
## [19] "visits"
# Drop the constant columns from both sets. `one_of()` is superseded;
# `any_of()` also tolerates names already absent from a set.
tr %<>% select(-any_of(fea_del))
te %<>% select(-any_of(fea_del))
# Confirm the new dimensions (test has one column fewer: no target).
dim(tr)
## [1] 903653 35
dim(te)
## [1] 804684 34
All these useless features we can safely remove.
After parsing of the JSON data we can observe many missing values in the data set. Let's find out how many missing values each feature has. We need to take into account that such values as "not available in demo dataset", "(not set)", "unknown.unknown", "(not provided)" can be treated as NA.
# Placeholder strings that are missing values in disguise in this export.
is_na_val <- function(x) x %in% c("not available in demo dataset", "(not provided)",
    "(not set)", "<NA>", "unknown.unknown", "(none)")
# Normalise the placeholders to real NA. `funs()` has been deprecated since
# dplyr 0.8.0 (it previously triggered a warning here); use `across()` with
# a lambda instead.
tr %<>% mutate(across(everything(), ~ ifelse(is_na_val(.x), NA, .x)))
te %<>% mutate(across(everything(), ~ ifelse(is_na_val(.x), NA, .x)))
There are many missing values across the features.
We need to convert some features to their natural representation, including date, hits(integer, provides a record of all page visits), pageviews, bounces, newVisits, transactionRevenue.
# Cast the features parsed out of the JSON columns (all read as character)
# to their natural types; the raw `date` is an integer like 20160902,
# parsed with lubridate::ymd().
tr %<>%
  mutate(date = ymd(date),
         across(c(hits, pageviews, bounces, newVisits), as.integer),
         transactionRevenue = as.numeric(transactionRevenue))
te %<>%
  mutate(date = ymd(date),
         across(c(hits, pageviews, bounces, newVisits), as.integer))
As a target variable we use transactionRevenue which is a sub-column of the totals JSON column. It looks like this variable is multiplied by \(10^6\).
# Pull out the target (revenue is stored scaled by 1e6 — see the division
# by 1e6 below) and drop it from the feature table.
y <- tr$transactionRevenue
tr$transactionRevenue <- NULL
summary(y)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000e+04 2.493e+07 4.945e+07 1.337e+08 1.077e+08 2.313e+10 892138
We can safely replace NA values with 0.
# NA here means no transaction occurred, so zero revenue is the right fill.
y[is.na(y)] <- 0
summary(y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000e+00 0.000e+00 0.000e+00 1.704e+06 0.000e+00 2.313e+10
The target variable has a wide range of values. Its distribution is right-skewed. For modelling we will use log-transformed target.
BTW, only 1.27% of all transactions have non-zero revenue:
The next figure shows that users who came via Affiliates and Social channels do not generate revenue. The most profitable channel is Referral:
Also usually first visit users generate more total revenue:
The revenue itself can be viewed as a timeseries. There seems to be a pattern of peaks.
There is an interesting separation in target variable by isTrueDirect feature:
Let's see if we can predict log-transformed mean daily revenue using timeseries. Here we use zoo and forecast packages for timeseries modelling:
# Aggregate to a daily series: log1p of the mean per-session revenue
# (rescaled by 1e6), as a zoo object indexed by date.
revenue <- tr %>%
  bind_cols(tibble(revenue = y)) %>%
  group_by(date) %>%
  summarise(mean_revenue = log1p(mean(revenue / 1e6))) %>%
  ungroup() %>%
  with(zoo(mean_revenue, order.by = date))
# Forecast horizon: the full span of the test period, in days.
h <- max(te$date) - min(te$date) + 1
# Visualise the daily series with a smoothed trend.
revenue %>%
  autoplot() +
  geom_line() +
  geom_smooth() +
  labs(x = "", y = "log(revenue)") +
  theme_minimal()
We use simple auto.arima model. We need to forecast for the period of 272 days.
# Fit an ARIMA model with automatic order selection to the daily series.
m_aa <- auto.arima(revenue)
summary(m_aa)
## Series: revenue
## ARIMA(4,0,3) with non-zero mean
##
## Coefficients:
## ar1 ar2 ar3 ar4 ma1 ma2 ma3 mean
## -0.2547 -0.4445 0.3578 -0.0995 0.6029 0.5490 -0.3260 0.8924
## s.e. 0.3520 0.2285 0.2290 0.0782 0.3492 0.3354 0.3007 0.0249
##
## sigma^2 estimated as 0.1451: log likelihood=-162.27
## AIC=342.54 AICc=343.04 BIC=377.66
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.0003557683 0.3766821 0.2882868 -Inf Inf 0.790288 -0.004423992
# Forecast over the test horizon and plot with prediction intervals.
forecast(m_aa, h = h) %>%
  autoplot() +
  theme_minimal()
Clearly, this model is of no use for long time period forecasting.
Let's add a regression term mean pageviews:
# Daily mean of log1p(pageviews) to use as an external ARIMA regressor.
# A shared helper replaces the duplicated train/test aggregation code.
daily_mean_pv <- function(df) {
  df %>%
    group_by(date) %>%
    summarize(mean_pv = log1p(mean(pageviews, na.rm = TRUE))) %>%
    ungroup() %$%
    mean_pv
}
mean_pv_tr <- daily_mean_pv(tr)
mean_pv_te <- daily_mean_pv(te)
# ARIMA with daily mean pageviews as an external regressor.
m_aa_reg <- auto.arima(revenue, xreg = mean_pv_tr)
summary(m_aa_reg)
## Series: revenue
## Regression with ARIMA(3,1,2) errors
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 xreg
## 0.7110 -0.2203 -0.0852 -1.3928 0.4779 2.4881
## s.e. 0.2007 0.0800 0.0688 0.1959 0.1750 0.3198
##
## sigma^2 estimated as 0.1216: log likelihood=-131.05
## AIC=276.11 AICc=276.42 BIC=303.41
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.01178035 0.345297 0.2658876 -Inf Inf 0.7288843 -0.004276301
# Forecasting with a regressor needs its future values: the test-set series.
forecast(m_aa_reg, h = h, xreg = mean_pv_te) %>%
  autoplot() +
  theme_minimal()
The dataset contains the timestamp column visitStartTime expressed as POSIX time. It allows us to create a bunch of features. Let's check symmetric differences of the time features from the train and test sets.
# Convert POSIX timestamps (seconds since epoch) to date-times;
# lubridate::as_datetime() defaults to UTC.
tr_vst <- as_datetime(tr$visitStartTime)
te_vst <- as_datetime(te$visitStartTime)
symdiff <- function(x, y) setdiff(union(x, y), intersect(x, y))
# Compare the sets of values each time feature takes in train vs test;
# an empty result means the feature covers the same values in both splits.
Year:
symdiff(tr_vst %>% year %>% unique, te_vst %>% year %>% unique)
## [1] 2016 2018
Month:
symdiff(tr_vst %>% month %>% unique, te_vst %>% month %>% unique)
## [1] 6 7
Day:
symdiff(tr_vst %>% day %>% unique, te_vst %>% day %>% unique)
## integer(0)
Week:
symdiff(tr_vst %>% week %>% unique, te_vst %>% week %>% unique)
## [1] 25 24 26 20 22 27 19 28 23 30 21 29
Day of the year:
symdiff(tr_vst %>% yday %>% unique, te_vst %>% yday %>% unique)
## [1] 174 175 164 165 172 173 122 162 163 182 181 139 140 176 150 149 136 135 213
## [20] 168 169 167 366 186 185 131 132 179 178 166 195 196 157 158 205 206 143 142
## [39] 123 124 159 171 210 209 204 145 146 138 141 187 188 203 202 126 127 152 153
## [58] 134 128 129 170 161 160 156 125 184 192 193 183 148 154 155 144 208 207 201
## [77] 130 189 133 137 194 200 212 190 151 177 211 147 191 199 197 198 180
Hour:
symdiff(tr_vst %>% hour %>% unique, te_vst %>% hour %>% unique)
## integer(0)
We can see that some time features (week, month, day of the year) from the train and test sets differ notably. Thus, they can cause overfitting, but year and hour can be useful.